Final Project: Phase 2¶
Spring 2024
Group: Michael Massone and Joseph Nelson Farrell
DS 5230 Unsupervised Machine Learning
Professor Steven Morin, PhD
Due: 03/11/2024
Libraries¶
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import sys
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
import warnings
Define File Paths¶
In [ ]:
# define path
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)
# path to figs folder
figs_path = path + '/figs'
# path to data
data_path= path + '/data'
# path to src folder
src_path = path + '/src'
print(src_path)
# sys path
sys.path.append(src_path)
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src
Functions¶
In [ ]:
from preprocessing_eda_utils import missingness_cols
from preprocessing_eda_utils import column_dtypes
from preprocessing_eda_utils import compute_vif
from preprocessing_eda_utils import generate_column_hist
from preprocessing_eda_utils import sub_divide_pairplot
Parameters¶
In [ ]:
# design matrix csv file name
design_matrix_file_name = "/curated/beans_design.csv"
# missingness threshold for determining columns to drop
missingness_threshold = 0.20
# min frequency for nomical transformer; OneHotEncoder
min_frequency = 20
Load Data¶
In [ ]:
# read in design matrix
design_df = pd.read_csv( data_path + design_matrix_file_name)
In [ ]:
# check columns for missingness
missingness_drop_list = missingness_cols(design_df, missingness_threshold)
# display results
if missingness_drop_list == []:
print(f'There are no columns that require dropping based on missingness')
else:
print(f'The following columns have a missingness proportion greater than {missingness_threshold}.')
print(f'These columns should be dropped:')
for i in range(len(missingness_drop_list)):
print(f'\t{i + 1}. {missingness_drop_list[i]}')
There are no columns that require dropping based on missingness
Visually Inspect Design Matrix:¶
In [ ]:
design_df.head(10)
Out[Â ]:
| Unnamed: 0 | id | Area | Perimeter | MajorAxisLength | MinorAxisLength | AspectRation | Eccentricity | ConvexArea | EquivDiameter | Extent | Solidity | roundness | Compactness | ShapeFactor1 | ShapeFactor2 | ShapeFactor3 | ShapeFactor4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 28395.0 | 610.291 | 208.178117 | 173.888747 | 1.197191 | 0.549812 | 28715.0 | 190.141097 | 0.763923 | 0.988856 | 0.958027 | 0.913358 | 0.007332 | 0.003147 | 0.834222 | 0.998724 |
| 1 | 1 | 1 | 28734.0 | 638.018 | 200.524796 | 182.734419 | 1.097356 | 0.411785 | 29172.0 | 191.272750 | 0.783968 | 0.984986 | 0.887034 | 0.953861 | 0.006979 | 0.003564 | 0.909851 | 0.998430 |
| 2 | 2 | 2 | 29380.0 | 624.110 | 212.826130 | 175.931143 | 1.209713 | 0.562727 | 29690.0 | 193.410904 | 0.778113 | 0.989559 | 0.947849 | 0.908774 | 0.007244 | 0.003048 | 0.825871 | 0.999066 |
| 3 | 3 | 3 | 30008.0 | 645.884 | 210.557999 | 182.516516 | 1.153638 | 0.498616 | 30724.0 | 195.467062 | 0.782681 | 0.976696 | 0.903936 | 0.928329 | 0.007017 | 0.003215 | 0.861794 | 0.994199 |
| 4 | 4 | 4 | 30140.0 | 620.134 | 201.847882 | 190.279279 | 1.060798 | 0.333680 | 30417.0 | 195.896503 | 0.773098 | 0.990893 | 0.984877 | 0.970516 | 0.006697 | 0.003665 | 0.941900 | 0.999166 |
| 5 | 5 | 5 | 30279.0 | 634.927 | 212.560556 | 181.510182 | 1.171067 | 0.520401 | 30600.0 | 196.347702 | 0.775688 | 0.989510 | 0.943852 | 0.923726 | 0.007020 | 0.003153 | 0.853270 | 0.999236 |
| 6 | 6 | 6 | 30477.0 | 670.033 | 211.050155 | 184.039050 | 1.146768 | 0.489478 | 30970.0 | 196.988633 | 0.762402 | 0.984081 | 0.853080 | 0.933374 | 0.006925 | 0.003242 | 0.871186 | 0.999049 |
| 7 | 7 | 7 | 30519.0 | 629.727 | 212.996755 | 182.737204 | 1.165591 | 0.513760 | 30847.0 | 197.124320 | 0.770682 | 0.989367 | 0.967109 | 0.925480 | 0.006979 | 0.003158 | 0.856514 | 0.998345 |
| 8 | 8 | 8 | 30685.0 | 635.681 | 213.534145 | 183.157146 | 1.165852 | 0.514081 | 31044.0 | 197.659696 | 0.771561 | 0.988436 | 0.954240 | 0.925658 | 0.006959 | 0.003152 | 0.856844 | 0.998953 |
| 9 | 9 | 9 | 30834.0 | 631.934 | 217.227813 | 180.897469 | 1.200834 | 0.553642 | 31120.0 | 198.139012 | 0.783683 | 0.990810 | 0.970278 | 0.912125 | 0.007045 | 0.003008 | 0.831973 | 0.999061 |
Identify Feature Types: Nominal & Numerical¶
In [ ]:
# get nominal and numerical columns, print column dtype
nominal_cols, numerical_cols, unique_value_cols = column_dtypes(design_df)
# display nominal cols
if nominal_cols == []:
print(f'There no nominal columns in the dataset.\n')
else:
print(f'The nominal columns are:')
for i in range(len(nominal_cols)):
print(f'{i + 1}: {nominal_cols[i]}')
# display numerical cols
if numerical_cols == []:
print(f'There no numerical columns in the dataset.')
else:
print(f'The {len(numerical_cols)} numerical columns are:')
for i in range(len(numerical_cols)):
print(f'\t{i + 1}: {numerical_cols[i]}')
# display unique values information
if unique_value_cols == []:
print(f'\nThere no columns with all unique values in the dataset.')
else:
print(f'\nThe {len(unique_value_cols)} columns with all unique values are:')
for i in range(len(unique_value_cols)):
print(f'\t{i + 1}: {unique_value_cols[i]}')
Column: Unnamed: 0 Data Type: int64 Unique value count: 13611, DF length: 13611, Ratio: 1.0 *** FLAG column Unnamed: 0 for review *** __________________________________________________________ Column: id Data Type: int64 Unique value count: 13611, DF length: 13611, Ratio: 1.0 *** FLAG column id for review *** __________________________________________________________ Column: Area Data Type: float64 Unique value count: 12011, DF length: 13611, Ratio: 0.88 __________________________________________________________ Column: Perimeter Data Type: float64 Unique value count: 13413, DF length: 13611, Ratio: 0.99 __________________________________________________________ Column: MajorAxisLength Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: MinorAxisLength Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: AspectRation Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: Eccentricity Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: ConvexArea Data Type: float64 Unique value count: 12066, DF length: 13611, Ratio: 0.89 __________________________________________________________ Column: EquivDiameter Data Type: float64 Unique value count: 12011, DF length: 13611, Ratio: 0.88 __________________________________________________________ Column: Extent Data Type: float64 Unique value count: 13535, DF length: 13611, Ratio: 0.99 __________________________________________________________ Column: Solidity Data Type: float64 Unique value count: 13526, DF length: 13611, Ratio: 0.99 __________________________________________________________ Column: roundness Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: Compactness Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: ShapeFactor1 Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: ShapeFactor2 Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: ShapeFactor3 Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ Column: ShapeFactor4 Data Type: float64 Unique value count: 13543, DF length: 13611, Ratio: 1.0 __________________________________________________________ There no nominal columns in the dataset. The 18 numerical columns are: 1: Unnamed: 0 2: id 3: Area 4: Perimeter 5: MajorAxisLength 6: MinorAxisLength 7: AspectRation 8: Eccentricity 9: ConvexArea 10: EquivDiameter 11: Extent 12: Solidity 13: roundness 14: Compactness 15: ShapeFactor1 16: ShapeFactor2 17: ShapeFactor3 18: ShapeFactor4 The 2 columns with all unique values are: 1: Unnamed: 0 2: id
Create Non Machine Learning Attributes List¶
In [ ]:
# display columns
design_df.columns
Out[Â ]:
Index(['Unnamed: 0', 'id', 'Area', 'Perimeter', 'MajorAxisLength',
'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea',
'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness',
'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4'],
dtype='object')
In [ ]:
# attributes not to be used for machine learning
non_ml_attr_list = ['id']
for col in non_ml_attr_list:
if col in nominal_cols:
nominal_cols.remove(col)
if col in numerical_cols:
numerical_cols.remove(col)
# instaniate ml_ignore_list
ml_ignore_list = [missingness_drop_list + non_ml_attr_list]
# add any attributes that should be dropped from dataframe
drop_list = ['Unnamed: 0'] # 'Unnamed: 0' is a secondary id column and can be dropped.
# drop cols in drop_list from design matrix
cols1 = design_df.columns
design_df.drop(drop_list, axis = 1, inplace = True)
assert(len(design_df.columns) != len(cols1))
# drop col names in drop_list from nominal and numerical lists
for col in drop_list:
if col in nominal_cols:
nominal_cols.remove(col)
if col in numerical_cols:
numerical_cols.remove(col)
assert(design_df.shape[1] == len(non_ml_attr_list) + len(numerical_cols) + len(nominal_cols))
# display finalized ML attributes information
print(f'Training X Dimensions: \n \t{design_df[numerical_cols + nominal_cols].shape}')
print('Non Machine Learning Attributes:')
for i, element in enumerate(non_ml_attr_list):
print(f'\t {i+1}: {element}')
print('Attributes Ignored in Machine Learning:')
for i, element in enumerate(ml_ignore_list):
print(f'\t {i+1}: {element}')
print('Numerical Columns:')
if numerical_cols == []:
print(f'\t None')
else:
for i, element in enumerate(numerical_cols):
print(f'\t {i+1}: {element}')
print('Nominal Columns:')
if nominal_cols == []:
print(f'\t None')
else:
for i, element in enumerate(nominal_cols):
print(f'\t {i+1}: {element}')
Training X Dimensions: (13611, 16) Non Machine Learning Attributes: 1: id Attributes Ignored in Machine Learning: 1: ['id'] Numerical Columns: 1: Area 2: Perimeter 3: MajorAxisLength 4: MinorAxisLength 5: AspectRation 6: Eccentricity 7: ConvexArea 8: EquivDiameter 9: Extent 10: Solidity 11: roundness 12: Compactness 13: ShapeFactor1 14: ShapeFactor2 15: ShapeFactor3 16: ShapeFactor4 Nominal Columns: None
Feature Destriptions
Area(A) - float64 - Ratio - The area of a bean zone and the number of pixels within its boundaries.Perimeter(P) - float64 - Ratio - Bean circumference is defined as the length of its border.MajorAxisLength(L) - float64 - Ratio - The distance between the ends of the longest line that can be drawn from a bean.MinorAxisLength(l) - float64 - Ratio - The longest line that can be drawn from the bean while standing perpendicular to the main axis.AspectRatio(K) - float64 - Ratio - Defines the relationship between L and l.Eccentricity(Ec) - float64 - Ratio - Eccentricity of the ellipse having the same moments as the region.ConvexArea(C) - float64 - Ratio - Number of pixels in the smallest convex polygon that can contain the area of a bean seed.EquivDiameter(Ed) - float64 - Ratio - The diameter of a circle having the same area as a bean seed area.Extent(Ex) - float64 - Ratio - The ratio of the pixels in the bounding box to the bean area.Solidity(S) - float64 - Ratio - Also known as convexity. The ratio of the pixels in the convex shell to those found in beans.Roundness(R) - float64 - Ratio - Calculated with the following formula: $\dfrac{(4 \pi A)}{(P^2)}$Compactness(CO) - float64 - Ratio - Measures the roundness of an object: $\dfrac{Ed}{L}$ShapeFactor1(SF1) - float64 - Ratio - $\dfrac{L}{A}$ShapeFactor2(SF2) - float64 - Ratio - $\dfrac{l}{A}$ShapeFactor3(SF3) - float64 - Ratio - $\dfrac{A}{\frac{L}{2}\cdot \frac{L}{2} \cdot \pi }$ShapeFactor4(SF4) - float64 - Ratio $\dfrac{A}{\frac{L}{2}\cdot \frac{l}{2} \cdot \pi }$
Display Missingness of Machine Learning Attributes:¶
In [ ]:
# display missingness plot; ml attributes
msno.matrix(design_df[numerical_cols + nominal_cols])
plt.title("Missingness: Machine Learning Attributes", weight = 'bold', fontsize = 24)
# save fig
plt.savefig(figs_path + "/missing_attributes_ml_attributes.png", bbox_inches = 'tight')
In [ ]:
print('Machine Learning Attributes Datatypes:')
design_df[numerical_cols + nominal_cols].dtypes
Machine Learning Attributes Datatypes:
Out[Â ]:
Area float64 Perimeter float64 MajorAxisLength float64 MinorAxisLength float64 AspectRation float64 Eccentricity float64 ConvexArea float64 EquivDiameter float64 Extent float64 Solidity float64 roundness float64 Compactness float64 ShapeFactor1 float64 ShapeFactor2 float64 ShapeFactor3 float64 ShapeFactor4 float64 dtype: object
Explore Correlation and VIF of Numerical Attributes:¶
In [ ]:
# set figure size
plt.figure(figsize = (10, 7))
# generate a mask to only show the bottom triangle
mask = np.triu(np.ones_like(design_df[numerical_cols].corr(), dtype=bool))
# generate heatmap
sns.heatmap(design_df[numerical_cols].corr(), annot = True, fmt = ".2f", mask = mask, vmin = -1, vmax = 1)
plt.title('Correlation Matrix of the Design Matrix', weight = 'bold', fontsize = 16)
# save fig
plt.savefig(figs_path + "/correlation_matrix.png", bbox_inches = 'tight')
VIF Factor: All Numerical Attributes¶
In [ ]:
vif = compute_vif(design_df, numerical_cols)
Variance Inflation Factors Above Threshold(5):
Variable VIF
0 Area 81390.772240
1 Perimeter 3573.980197
2 MajorAxisLength 87936.513102
3 MinorAxisLength 77482.673330
4 AspectRation 13918.856718
5 Eccentricity 1183.004420
6 ConvexArea 78552.516774
7 EquivDiameter 314929.768895
9 Solidity 14.053550
10 roundness 104.028512
11 Compactness 276769.624251
12 ShapeFactor1 607.175671
13 ShapeFactor2 1245.330594
14 ShapeFactor3 200286.806522
15 ShapeFactor4 67.043776
Variance Inflation Factors Below Threshold(5):
Variable VIF
8 Extent 1.241536
VIF Factor without Shapefactor Attributes¶
In [ ]:
vif = compute_vif(design_df, numerical_cols[:12])
Variance Inflation Factors Above Threshold(5):
Variable VIF
0 Area 55455.469502
1 Perimeter 3369.110823
2 MajorAxisLength 8280.492793
3 MinorAxisLength 5661.863735
4 AspectRation 430.562583
5 Eccentricity 176.949307
6 ConvexArea 55945.114110
7 EquivDiameter 26144.294906
9 Solidity 10.649496
10 roundness 98.457059
11 Compactness 999.791946
Variance Inflation Factors Below Threshold(5):
Variable VIF
8 Extent 1.238184
Numerical Transformer: Impute & Scale¶
In [ ]:
# instantiate sklearn pipeline
numerical_transformer = Pipeline(
steps=[
('imputer', SimpleImputer()),
('scaler', StandardScaler())
]
)
In [ ]:
if len(nominal_cols) > 0:
print(design_df[nominal_cols].value_count())
print('Set min_frequency for nominal transformer in perameters cell.')
else:
min_frequency = None
Nominal Transformer¶
In [ ]:
# instantiate nominal transformer
nominal_transformer = Pipeline(
steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot_encoder', OneHotEncoder(sparse_output=False, min_frequency=min_frequency))
]
)
Column Transformer:¶
In [ ]:
preprocessor = ColumnTransformer(
transformers=[
('numerical', numerical_transformer, numerical_cols),
('nominal', nominal_transformer, nominal_cols)
]
)
Execute Pipeline with Design Matrix¶
In [ ]:
# execute pipeline
trans_df = pd.DataFrame(
data = preprocessor.fit_transform(design_df),
index = design_df.index, # KEEP TRACK OF INDEX
columns = preprocessor.get_feature_names_out() #NAME COlumns
)
In [ ]:
# display transfored dataframe
trans_df
Out[Â ]:
| numerical__Area | numerical__Perimeter | numerical__MajorAxisLength | numerical__MinorAxisLength | numerical__AspectRation | numerical__Eccentricity | numerical__ConvexArea | numerical__EquivDiameter | numerical__Extent | numerical__Solidity | numerical__roundness | numerical__Compactness | numerical__ShapeFactor1 | numerical__ShapeFactor2 | numerical__ShapeFactor3 | numerical__ShapeFactor4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.840749 | -1.143319 | -1.306598 | -0.631153 | -1.565053 | -2.185720 | -0.841451 | -1.063341 | 0.289087 | 0.367613 | 1.423867 | 1.839116 | 0.680786 | 2.402173 | 1.925723 | 0.838371 |
| 1 | -0.829188 | -1.013924 | -1.395911 | -0.434445 | -1.969784 | -3.686040 | -0.826102 | -1.044217 | 0.697477 | -0.462907 | 0.231054 | 2.495449 | 0.367967 | 3.100893 | 2.689702 | 0.771138 |
| 2 | -0.807157 | -1.078829 | -1.252357 | -0.585735 | -1.514291 | -2.045336 | -0.808704 | -1.008084 | 0.578195 | 0.518417 | 1.252865 | 1.764843 | 0.603129 | 2.235091 | 1.841356 | 0.916755 |
| 3 | -0.785741 | -0.977215 | -1.278825 | -0.439290 | -1.741618 | -2.742211 | -0.773975 | -0.973337 | 0.671260 | -2.241767 | 0.515049 | 2.081715 | 0.401718 | 2.515075 | 2.204250 | -0.197985 |
| 4 | -0.781239 | -1.097384 | -1.380471 | -0.266663 | -2.117993 | -4.535028 | -0.784286 | -0.966080 | 0.476020 | 0.804772 | 1.874992 | 2.765330 | 0.118268 | 3.270983 | 3.013462 | 0.939640 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 13606 | -0.373471 | -0.446083 | -0.366669 | -0.363055 | -0.123703 | 0.153343 | -0.378191 | -0.364148 | -0.716284 | 0.684173 | 0.727872 | 0.032433 | 0.261425 | 0.055630 | -0.006086 | 0.760813 |
| 13607 | -0.373334 | -0.456336 | -0.450053 | -0.257015 | -0.432979 | -0.165141 | -0.378662 | -0.363962 | 1.022933 | 0.774384 | 0.818807 | 0.362794 | 0.110384 | 0.285117 | 0.328393 | 0.722659 |
| 13608 | -0.372038 | -0.447833 | -0.450478 | -0.246456 | -0.448618 | -0.182940 | -0.376143 | -0.362197 | -0.403392 | 0.591370 | 0.758468 | 0.370533 | 0.104269 | 0.289204 | 0.336328 | 0.390251 |
| 13609 | -0.371765 | -0.427029 | -0.428974 | -0.266742 | -0.380735 | -0.106960 | -0.372851 | -0.361825 | -0.903414 | 0.143717 | 0.581753 | 0.285098 | 0.141906 | 0.228375 | 0.248973 | 0.036440 |
| 13610 | -0.371356 | -0.387557 | -0.291736 | -0.446224 | 0.148374 | 0.389116 | -0.375101 | -0.361268 | 0.799227 | 0.537539 | 0.253681 | -0.240904 | 0.387512 | -0.127776 | -0.276481 | 0.713719 |
13611 rows × 16 columns
Exploratory Data Analysis¶
In [ ]:
# generate attributes list
attr_list = trans_df.columns
# display machine learning attributes list
print(f'Attributes List:')
for i, j in enumerate(attr_list):
print(f'{i + 1}: {j}')
Attributes List: 1: numerical__Area 2: numerical__Perimeter 3: numerical__MajorAxisLength 4: numerical__MinorAxisLength 5: numerical__AspectRation 6: numerical__Eccentricity 7: numerical__ConvexArea 8: numerical__EquivDiameter 9: numerical__Extent 10: numerical__Solidity 11: numerical__roundness 12: numerical__Compactness 13: numerical__ShapeFactor1 14: numerical__ShapeFactor2 15: numerical__ShapeFactor3 16: numerical__ShapeFactor4
In [ ]:
# display dimension of transformed dataframe
print(f'Transformed Dataframe Dimensions:', trans_df[attr_list].shape)
Transformed Dataframe Dimensions: (13611, 16)
In [ ]:
# visually inspect transformed df
display(trans_df.head())
| numerical__Area | numerical__Perimeter | numerical__MajorAxisLength | numerical__MinorAxisLength | numerical__AspectRation | numerical__Eccentricity | numerical__ConvexArea | numerical__EquivDiameter | numerical__Extent | numerical__Solidity | numerical__roundness | numerical__Compactness | numerical__ShapeFactor1 | numerical__ShapeFactor2 | numerical__ShapeFactor3 | numerical__ShapeFactor4 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.840749 | -1.143319 | -1.306598 | -0.631153 | -1.565053 | -2.185720 | -0.841451 | -1.063341 | 0.289087 | 0.367613 | 1.423867 | 1.839116 | 0.680786 | 2.402173 | 1.925723 | 0.838371 |
| 1 | -0.829188 | -1.013924 | -1.395911 | -0.434445 | -1.969784 | -3.686040 | -0.826102 | -1.044217 | 0.697477 | -0.462907 | 0.231054 | 2.495449 | 0.367967 | 3.100893 | 2.689702 | 0.771138 |
| 2 | -0.807157 | -1.078829 | -1.252357 | -0.585735 | -1.514291 | -2.045336 | -0.808704 | -1.008084 | 0.578195 | 0.518417 | 1.252865 | 1.764843 | 0.603129 | 2.235091 | 1.841356 | 0.916755 |
| 3 | -0.785741 | -0.977215 | -1.278825 | -0.439290 | -1.741618 | -2.742211 | -0.773975 | -0.973337 | 0.671260 | -2.241767 | 0.515049 | 2.081715 | 0.401718 | 2.515075 | 2.204250 | -0.197985 |
| 4 | -0.781239 | -1.097384 | -1.380471 | -0.266663 | -2.117993 | -4.535028 | -0.784286 | -0.966080 | 0.476020 | 0.804772 | 1.874992 | 2.765330 | 0.118268 | 3.270983 | 3.013462 | 0.939640 |
In [ ]:
# display transformed df general information
trans_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 13611 entries, 0 to 13610 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 numerical__Area 13611 non-null float64 1 numerical__Perimeter 13611 non-null float64 2 numerical__MajorAxisLength 13611 non-null float64 3 numerical__MinorAxisLength 13611 non-null float64 4 numerical__AspectRation 13611 non-null float64 5 numerical__Eccentricity 13611 non-null float64 6 numerical__ConvexArea 13611 non-null float64 7 numerical__EquivDiameter 13611 non-null float64 8 numerical__Extent 13611 non-null float64 9 numerical__Solidity 13611 non-null float64 10 numerical__roundness 13611 non-null float64 11 numerical__Compactness 13611 non-null float64 12 numerical__ShapeFactor1 13611 non-null float64 13 numerical__ShapeFactor2 13611 non-null float64 14 numerical__ShapeFactor3 13611 non-null float64 15 numerical__ShapeFactor4 13611 non-null float64 dtypes: float64(16) memory usage: 1.7 MB
In [ ]:
# display counts of NA, None, and np.nan
print('\nNA (np.nan or None) Count:\n',
trans_df.isna().sum(), sep ='')
NA (np.nan or None) Count: numerical__Area 0 numerical__Perimeter 0 numerical__MajorAxisLength 0 numerical__MinorAxisLength 0 numerical__AspectRation 0 numerical__Eccentricity 0 numerical__ConvexArea 0 numerical__EquivDiameter 0 numerical__Extent 0 numerical__Solidity 0 numerical__roundness 0 numerical__Compactness 0 numerical__ShapeFactor1 0 numerical__ShapeFactor2 0 numerical__ShapeFactor3 0 numerical__ShapeFactor4 0 dtype: int64
In [ ]:
# display proportion of NA, None, and np.nan
print('\nNA (np.nan or None) Ratio:\n',
trans_df.isna().sum() / trans_df.shape[0], sep='')
NA (np.nan or None) Ratio: numerical__Area 0.0 numerical__Perimeter 0.0 numerical__MajorAxisLength 0.0 numerical__MinorAxisLength 0.0 numerical__AspectRation 0.0 numerical__Eccentricity 0.0 numerical__ConvexArea 0.0 numerical__EquivDiameter 0.0 numerical__Extent 0.0 numerical__Solidity 0.0 numerical__roundness 0.0 numerical__Compactness 0.0 numerical__ShapeFactor1 0.0 numerical__ShapeFactor2 0.0 numerical__ShapeFactor3 0.0 numerical__ShapeFactor4 0.0 dtype: float64
Pairplots¶
In [ ]:
# ignore warnings
warnings.filterwarnings('ignore', category = FutureWarning)
# generate plot
plot = sns.pairplot(data = trans_df)
plot.fig.suptitle('Attribute Pairplots', fontsize = 60, weight = 'bold', style = "italic", y = 1.03)
plt.tight_layout()
# save fig
plt.savefig(figs_path + f"/pairplot_full.png", bbox_inches = 'tight')
In [ ]:
sub_divide_pairplot(trans_df)
Histograms of Numerical Columns¶
In [ ]:
# divide numeric columns into groups of 4 for plotting
divided_columns = [attr_list[i:i+4] for i in range(0, len(attr_list), 4)]
# generate plots
for i, cols in enumerate(divided_columns):
generate_column_hist(trans_df, cols)
# save fig
plt.savefig(figs_path + f"/attribute_hist_{i}.png", bbox_inches = 'tight')